Анализ временных рядов - данный метод позволяет проанализировать изменения продаж во времени и выявить цикличность или сезонность в данных.¶

Импорт библиотек

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

Чтение данных из эксель файла

In [2]:
df = pd.read_excel('Задания_1_2.xlsx')
df.Date = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
df
Out[2]:
series1
Date
2015-01-01 1006.699649
2015-01-02 3197.751826
2015-01-03 3217.491035
2015-01-04 2151.573759
2015-01-05 4243.929892
... ...
2019-06-26 4007.059387
2019-06-27 4836.106157
2019-06-28 4895.323783
2019-06-29 4086.016222
2019-06-30 3572.796793

1642 rows × 1 columns

Построение графика

In [3]:
fig = px.line(df, x = df.index, y = 'series1')
fig.show()
In [4]:
from statsmodels.tsa.stattools import adfuller, kpss
result = adfuller(df['series1'], regression='n', autolag='BIC')
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))
ADF Statistic: -1.203086
p-value: 0.209619
Critical Values:
	1%: -2.567
	5%: -1.941
	10%: -1.617
In [5]:
result_kpss = kpss(df['series1'], regression='ct')
print('ADF Statistic: %f' % result_kpss[0])
print('p-value: %f' % result_kpss[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))
ADF Statistic: 0.181991
p-value: 0.022753
Critical Values:
	1%: -2.567
	5%: -1.941
	10%: -1.617
In [6]:
from scipy import stats
from scipy.special import inv_boxcox

# Преобразование
df['boxcox'], lambda_value = stats.boxcox(df['series1'])
df['boxcox_shifted_S'] = df.boxcox - df.boxcox.shift(12)
df['boxcox_shifted'] = df.boxcox_shifted_S- df.boxcox_shifted_S.shift(1)
In [7]:
non_nan_values = df['boxcox_shifted'].dropna()
non_nan_values = pd.DataFrame({'non_nan_values': non_nan_values})
non_nan_values
Out[7]:
non_nan_values
Date
2015-01-14 -2.144242
2015-01-15 0.048067
2015-01-16 0.641200
2015-01-17 -1.425537
2015-01-18 0.012073
... ...
2019-06-26 -0.271553
2019-06-27 0.890321
2019-06-28 0.036524
2019-06-29 -0.878101
2019-06-30 0.124530

1629 rows × 1 columns

In [8]:
from statsmodels.tsa.stattools import adfuller, kpss
result_kpss = kpss(non_nan_values['non_nan_values'], regression='ct')
print('ADF Statistic: %f' % result_kpss[0])
print('p-value: %f' % result_kpss[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))
ADF Statistic: 0.051231
p-value: 0.100000
Critical Values:
	1%: -2.567
	5%: -1.941
	10%: -1.617
C:\Users\user\AppData\Roaming\Python\Python39\site-packages\statsmodels\tsa\stattools.py:2022: InterpolationWarning:

The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.


In [9]:
from statsmodels.tsa.stattools import adfuller, kpss
result = adfuller(non_nan_values['non_nan_values'], regression='c', autolag='AIC')
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
print(result_kpss)
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))
ADF Statistic: -13.725321
p-value: 0.000000
Critical Values:
(0.05123085365144317, 0.1, 69, {'10%': 0.119, '5%': 0.146, '2.5%': 0.176, '1%': 0.216})
	1%: -3.434
	5%: -2.863
	10%: -2.568
In [10]:
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf

# Plot acf and pacf
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5), dpi=80)
plot_acf(non_nan_values)
plot_pacf(non_nan_values, lags=40, method='ywm')
ax1.tick_params(axis='both', labelsize=12)
ax2.tick_params(axis='both', labelsize=12)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [11]:
train = non_nan_values.iloc[:-int(len(non_nan_values) * 0.2)]
test = non_nan_values.iloc[-int(len(non_nan_values) * 0.2):]
In [12]:
import pmdarima as pm
mymodel = pm.auto_arima(
    train, 
    start_p = 1, start_q = 1,
    max_p = 5, max_q = 5,
    
    seasonal = True, m = 12,
    trace=True,
    suppress_warnings=True,
    max_P = 2, max_Q = 2,
    
    max_D = 2, max_d=2,
    alpha=0.05,
    test = 'kpss',
    seasonal_test='ocsb',
    
    n_fits=100,
    stepwise=False,
    information_criterion='bic',
    out_of_sample_size=7
    ) 
print(mymodel.summary())
 ARIMA(0,0,0)(0,0,0)[12] intercept   : BIC=2993.211, Time=0.19 sec
 ARIMA(0,0,0)(0,0,1)[12] intercept   : BIC=inf, Time=1.55 sec
 ARIMA(0,0,0)(0,0,2)[12] intercept   : BIC=inf, Time=4.32 sec
 ARIMA(0,0,0)(1,0,0)[12] intercept   : BIC=2614.715, Time=0.81 sec
 ARIMA(0,0,0)(1,0,1)[12] intercept   : BIC=inf, Time=1.78 sec
 ARIMA(0,0,0)(1,0,2)[12] intercept   : BIC=inf, Time=2.86 sec
 ARIMA(0,0,0)(2,0,0)[12] intercept   : BIC=2504.470, Time=1.39 sec
 ARIMA(0,0,0)(2,0,1)[12] intercept   : BIC=inf, Time=4.61 sec
 ARIMA(0,0,0)(2,0,2)[12] intercept   : BIC=inf, Time=6.13 sec
 ARIMA(0,0,1)(0,0,0)[12] intercept   : BIC=2647.221, Time=0.37 sec
 ARIMA(0,0,1)(0,0,1)[12] intercept   : BIC=inf, Time=2.80 sec
 ARIMA(0,0,1)(0,0,2)[12] intercept   : BIC=inf, Time=4.92 sec
 ARIMA(0,0,1)(1,0,0)[12] intercept   : BIC=2337.737, Time=0.70 sec
 ARIMA(0,0,1)(1,0,1)[12] intercept   : BIC=inf, Time=2.82 sec
 ARIMA(0,0,1)(1,0,2)[12] intercept   : BIC=inf, Time=6.24 sec
 ARIMA(0,0,1)(2,0,0)[12] intercept   : BIC=2108.279, Time=1.91 sec
 ARIMA(0,0,1)(2,0,1)[12] intercept   : BIC=inf, Time=5.57 sec
 ARIMA(0,0,1)(2,0,2)[12] intercept   : BIC=inf, Time=7.42 sec
 ARIMA(0,0,2)(0,0,0)[12] intercept   : BIC=inf, Time=1.15 sec
 ARIMA(0,0,2)(0,0,1)[12] intercept   : BIC=inf, Time=2.55 sec
 ARIMA(0,0,2)(0,0,2)[12] intercept   : BIC=inf, Time=5.13 sec
 ARIMA(0,0,2)(1,0,0)[12] intercept   : BIC=2249.351, Time=1.00 sec
 ARIMA(0,0,2)(1,0,1)[12] intercept   : BIC=inf, Time=3.11 sec
 ARIMA(0,0,2)(1,0,2)[12] intercept   : BIC=inf, Time=7.05 sec
 ARIMA(0,0,2)(2,0,0)[12] intercept   : BIC=2079.198, Time=2.74 sec
 ARIMA(0,0,2)(2,0,1)[12] intercept   : BIC=inf, Time=5.45 sec
 ARIMA(0,0,3)(0,0,0)[12] intercept   : BIC=inf, Time=1.86 sec
 ARIMA(0,0,3)(0,0,1)[12] intercept   : BIC=inf, Time=3.14 sec
 ARIMA(0,0,3)(0,0,2)[12] intercept   : BIC=inf, Time=7.21 sec
 ARIMA(0,0,3)(1,0,0)[12] intercept   : BIC=2246.229, Time=1.26 sec
 ARIMA(0,0,3)(1,0,1)[12] intercept   : BIC=inf, Time=5.06 sec
 ARIMA(0,0,3)(2,0,0)[12] intercept   : BIC=2085.636, Time=3.85 sec
 ARIMA(0,0,4)(0,0,0)[12] intercept   : BIC=inf, Time=3.06 sec
 ARIMA(0,0,4)(0,0,1)[12] intercept   : BIC=inf, Time=4.29 sec
 ARIMA(0,0,4)(1,0,0)[12] intercept   : BIC=2251.691, Time=1.62 sec
 ARIMA(0,0,5)(0,0,0)[12] intercept   : BIC=inf, Time=2.21 sec
 ARIMA(1,0,0)(0,0,0)[12] intercept   : BIC=2895.313, Time=0.28 sec
 ARIMA(1,0,0)(0,0,1)[12] intercept   : BIC=inf, Time=1.60 sec
 ARIMA(1,0,0)(0,0,2)[12] intercept   : BIC=inf, Time=4.79 sec
 ARIMA(1,0,0)(1,0,0)[12] intercept   : BIC=2507.114, Time=0.67 sec
 ARIMA(1,0,0)(1,0,1)[12] intercept   : BIC=inf, Time=3.17 sec
 ARIMA(1,0,0)(1,0,2)[12] intercept   : BIC=inf, Time=3.99 sec
 ARIMA(1,0,0)(2,0,0)[12] intercept   : BIC=2331.320, Time=2.14 sec
 ARIMA(1,0,0)(2,0,1)[12] intercept   : BIC=inf, Time=5.05 sec
 ARIMA(1,0,0)(2,0,2)[12] intercept   : BIC=inf, Time=5.68 sec
 ARIMA(1,0,1)(0,0,0)[12] intercept   : BIC=inf, Time=1.38 sec
 ARIMA(1,0,1)(0,0,1)[12] intercept   : BIC=inf, Time=2.98 sec
 ARIMA(1,0,1)(0,0,2)[12] intercept   : BIC=inf, Time=6.11 sec
 ARIMA(1,0,1)(1,0,0)[12] intercept   : BIC=2244.383, Time=1.37 sec
 ARIMA(1,0,1)(1,0,1)[12] intercept   : BIC=inf, Time=3.12 sec
 ARIMA(1,0,1)(1,0,2)[12] intercept   : BIC=inf, Time=7.71 sec
 ARIMA(1,0,1)(2,0,0)[12] intercept   : BIC=2079.557, Time=3.49 sec
 ARIMA(1,0,1)(2,0,1)[12] intercept   : BIC=inf, Time=5.77 sec
 ARIMA(1,0,2)(0,0,0)[12] intercept   : BIC=inf, Time=1.37 sec
 ARIMA(1,0,2)(0,0,1)[12] intercept   : BIC=inf, Time=3.28 sec
 ARIMA(1,0,2)(0,0,2)[12] intercept   : BIC=inf, Time=6.33 sec
 ARIMA(1,0,2)(1,0,0)[12] intercept   : BIC=2249.681, Time=2.36 sec
 ARIMA(1,0,2)(1,0,1)[12] intercept   : BIC=inf, Time=3.49 sec
 ARIMA(1,0,2)(2,0,0)[12] intercept   : BIC=2085.903, Time=3.66 sec
 ARIMA(1,0,3)(0,0,0)[12] intercept   : BIC=inf, Time=1.25 sec
 ARIMA(1,0,3)(0,0,1)[12] intercept   : BIC=inf, Time=2.96 sec
 ARIMA(1,0,3)(1,0,0)[12] intercept   : BIC=2252.894, Time=2.39 sec
 ARIMA(1,0,4)(0,0,0)[12] intercept   : BIC=inf, Time=2.33 sec
 ARIMA(2,0,0)(0,0,0)[12] intercept   : BIC=2745.267, Time=0.31 sec
 ARIMA(2,0,0)(0,0,1)[12] intercept   : BIC=inf, Time=3.19 sec
 ARIMA(2,0,0)(0,0,2)[12] intercept   : BIC=inf, Time=16.05 sec
 ARIMA(2,0,0)(1,0,0)[12] intercept   : BIC=2460.390, Time=1.12 sec
 ARIMA(2,0,0)(1,0,1)[12] intercept   : BIC=inf, Time=3.74 sec
 ARIMA(2,0,0)(1,0,2)[12] intercept   : BIC=inf, Time=7.49 sec
 ARIMA(2,0,0)(2,0,0)[12] intercept   : BIC=2257.867, Time=6.58 sec
 ARIMA(2,0,0)(2,0,1)[12] intercept   : BIC=inf, Time=8.97 sec
 ARIMA(2,0,1)(0,0,0)[12] intercept   : BIC=inf, Time=1.51 sec
 ARIMA(2,0,1)(0,0,1)[12] intercept   : BIC=inf, Time=4.61 sec
 ARIMA(2,0,1)(0,0,2)[12] intercept   : BIC=inf, Time=7.17 sec
 ARIMA(2,0,1)(1,0,0)[12] intercept   : BIC=2248.408, Time=1.74 sec
 ARIMA(2,0,1)(1,0,1)[12] intercept   : BIC=inf, Time=4.21 sec
 ARIMA(2,0,1)(2,0,0)[12] intercept   : BIC=2085.424, Time=5.29 sec
 ARIMA(2,0,2)(0,0,0)[12] intercept   : BIC=inf, Time=2.09 sec
 ARIMA(2,0,2)(0,0,1)[12] intercept   : BIC=inf, Time=4.06 sec
 ARIMA(2,0,2)(1,0,0)[12] intercept   : BIC=inf, Time=4.56 sec
 ARIMA(2,0,3)(0,0,0)[12] intercept   : BIC=inf, Time=2.60 sec
 ARIMA(3,0,0)(0,0,0)[12] intercept   : BIC=2700.759, Time=0.63 sec
 ARIMA(3,0,0)(0,0,1)[12] intercept   : BIC=inf, Time=3.23 sec
 ARIMA(3,0,0)(0,0,2)[12] intercept   : BIC=inf, Time=6.12 sec
 ARIMA(3,0,0)(1,0,0)[12] intercept   : BIC=2409.244, Time=1.26 sec
 ARIMA(3,0,0)(1,0,1)[12] intercept   : BIC=inf, Time=4.02 sec
 ARIMA(3,0,0)(2,0,0)[12] intercept   : BIC=2212.589, Time=2.76 sec
 ARIMA(3,0,1)(0,0,0)[12] intercept   : BIC=inf, Time=2.04 sec
 ARIMA(3,0,1)(0,0,1)[12] intercept   : BIC=inf, Time=3.60 sec
 ARIMA(3,0,1)(1,0,0)[12] intercept   : BIC=2237.885, Time=2.00 sec
 ARIMA(3,0,2)(0,0,0)[12] intercept   : BIC=inf, Time=2.44 sec
 ARIMA(4,0,0)(0,0,0)[12] intercept   : BIC=2692.358, Time=0.52 sec
 ARIMA(4,0,0)(0,0,1)[12] intercept   : BIC=inf, Time=3.09 sec
 ARIMA(4,0,0)(1,0,0)[12] intercept   : BIC=2347.557, Time=1.22 sec
 ARIMA(4,0,1)(0,0,0)[12] intercept   : BIC=inf, Time=2.40 sec
 ARIMA(5,0,0)(0,0,0)[12] intercept   : BIC=2563.931, Time=0.75 sec

Best model:  ARIMA(0,0,2)(2,0,0)[12] intercept
Total fit time: 329.427 seconds
                                      SARIMAX Results                                      
===========================================================================================
Dep. Variable:                                   y   No. Observations:                 1304
Model:             SARIMAX(0, 0, 2)x(2, 0, [], 12)   Log Likelihood               -1018.079
Date:                             Tue, 11 Jun 2024   AIC                           2048.159
Time:                                     18:13:18   BIC                           2079.198
Sample:                                          0   HQIC                          2059.803
                                            - 1304                                         
Covariance Type:                               opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept     -0.0002      0.003     -0.050      0.960      -0.007       0.007
ma.L1         -0.6316      0.018    -34.407      0.000      -0.668      -0.596
ma.L2         -0.1671      0.020     -8.241      0.000      -0.207      -0.127
ar.S.L12      -0.6461      0.020    -31.989      0.000      -0.686      -0.606
ar.S.L24      -0.3713      0.018    -20.690      0.000      -0.406      -0.336
sigma2         0.2770      0.005     52.297      0.000       0.267       0.287
===================================================================================
Ljung-Box (L1) (Q):                   0.06   Jarque-Bera (JB):              5419.41
Prob(Q):                              0.81   Prob(JB):                         0.00
Heteroskedasticity (H):               0.62   Skew:                            -1.23
Prob(H) (two-sided):                  0.00   Kurtosis:                        12.68
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [13]:
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(train, order=(0,0,2),
              seasonal_order=(2,0,0,12)).fit()
forecasts = model.forecast(len(test))
forecasts
C:\Users\user\AppData\Roaming\Python\Python39\site-packages\statsmodels\tsa\base\tsa_model.py:471: ValueWarning:

No frequency information was provided, so inferred frequency D will be used.

C:\Users\user\AppData\Roaming\Python\Python39\site-packages\statsmodels\tsa\base\tsa_model.py:471: ValueWarning:

No frequency information was provided, so inferred frequency D will be used.

C:\Users\user\AppData\Roaming\Python\Python39\site-packages\statsmodels\tsa\base\tsa_model.py:471: ValueWarning:

No frequency information was provided, so inferred frequency D will be used.

Out[13]:
2018-08-10    0.206257
2018-08-11   -0.853199
2018-08-12   -0.121371
2018-08-13   -0.227366
2018-08-14   -0.013188
                ...   
2019-06-26   -0.000073
2019-06-27   -0.000072
2019-06-28   -0.000073
2019-06-29   -0.000071
2019-06-30   -0.000072
Freq: D, Name: predicted_mean, Length: 325, dtype: float64
In [14]:
data_forecasts = pd.DataFrame({'forecasts': forecasts})
data_forecasts
Out[14]:
forecasts
2018-08-10 0.206257
2018-08-11 -0.853199
2018-08-12 -0.121371
2018-08-13 -0.227366
2018-08-14 -0.013188
... ...
2019-06-26 -0.000073
2019-06-27 -0.000072
2019-06-28 -0.000073
2019-06-29 -0.000071
2019-06-30 -0.000072

325 rows × 1 columns

In [15]:
from scipy.special import inv_boxcox
full_df = pd.merge(df,data_forecasts, left_on = df.index, right_on = data_forecasts.index , how = 'left' )
full_df
Out[15]:
key_0 series1 boxcox boxcox_shifted_S boxcox_shifted forecasts
0 2015-01-01 1006.699649 9.427615 NaN NaN NaN
1 2015-01-02 3197.751826 11.621358 NaN NaN NaN
2 2015-01-03 3217.491035 11.633629 NaN NaN NaN
3 2015-01-04 2151.573759 10.844711 NaN NaN NaN
4 2015-01-05 4243.929892 12.192449 NaN NaN NaN
... ... ... ... ... ... ...
1637 2019-06-26 4007.059387 12.075449 -0.228174 -0.271553 -0.000073
1638 2019-06-27 4836.106157 12.460696 0.662147 0.890321 -0.000072
1639 2019-06-28 4895.323783 12.485843 0.698671 0.036524 -0.000073
1640 2019-06-29 4086.016222 12.115136 -0.179430 -0.878101 -0.000071
1641 2019-06-30 3572.796793 11.843477 -0.054900 0.124530 -0.000072

1642 rows × 6 columns

In [16]:
full_df['boxcox_shifted_S'] = full_df.forecasts + full_df.boxcox_shifted_S.shift(1)
full_df['boxcox'] = full_df.boxcox_shifted_S + full_df.boxcox.shift(12)
full_df['beer'] = inv_boxcox(full_df.boxcox, lambda_value)
In [17]:
predict = full_df['beer'].dropna()
predict
Out[17]:
1317    3350.744376
1318    4780.633711
1319    3739.578494
1320    2928.890317
1321    4740.969386
           ...     
1637    4576.297157
1638    3116.802369
1639    4809.361048
1640    6241.050493
1641    3357.733594
Name: beer, Length: 325, dtype: float64
In [18]:
train_df = df.iloc[:-int(len(df) * 0.2)]
test_df = df.iloc[-int(len(df) * 0.2):]
In [19]:
# Import packages
import plotly.graph_objects as go

def plot_forecasts(forecasts: list[float], title: str) -> None:
    """Function to plot the forecasts."""
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=train_df.index, y=train_df['series1'], name='Train'))
    fig.add_trace(go.Scatter(x=test_df.index, y=test_df['series1'], name='Test'))
    fig.add_trace(go.Scatter(x=test_df.index, y=predict, name='Forecast'))
    fig.update_layout(template="simple_white", font=dict(size=18), title_text=title,
                      width=650, title_x=0.5, height=400, xaxis_title='Date')

    return fig.show()


# Plot the forecasts
plot_forecasts(forecasts, 'ARIMA')
In [20]:
res = test_df.iloc[:len(predict)]['series1']
res
Out[20]:
Date
2018-08-07    4322.886728
2018-08-08    4540.018025
2018-08-09    4224.190684
2018-08-10    3868.884925
2018-08-11    3691.232046
                 ...     
2019-06-23    3375.404705
2019-06-24    4401.843563
2019-06-25    3710.971255
2019-06-26    4007.059387
2019-06-27    4836.106157
Name: series1, Length: 325, dtype: float64
In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Вычисление средней абсолютной ошибки (MAE)
mae = mean_absolute_error(res, predict)
print("MAE:", mae)

# Вычисление среднеквадратичной ошибки (MSE)
mse = mean_squared_error(res, predict)
print("MSE:", mse)
MAE: 869.6450418707591
MSE: 1665688.4396404284
In [ ]: